library(tidyverse)
library(ggdendro)
library(here)
library(plotly)
library(cluster)
library(ggfortify)
library(broom)
source(here::here("code/plota_solucoes_hclust.R"))
games = read_csv("../data/international-football.csv")
Parsed with column specification:
cols(
date = col_date(format = ""),
home_team = col_character(),
away_team = col_character(),
home_score = col_integer(),
away_score = col_integer(),
tournament = col_character(),
city = col_character(),
country = col_character(),
neutral = col_logical()
)
jogos_brasil = games %>%
filter(home_team == "Brazil" | away_team == "Brazil") %>%
mutate(time1 = "Brazil",
time2 = if_else(home_team == "Brazil", away_team, home_team),
score1 = if_else(home_team == "Brazil", home_score, away_score),
score2 = if_else(home_team == "Brazil", away_score, home_score)
)
historicos = jogos_brasil %>%
group_by(time2) %>%
summarise(
jogos = n(),
ganhou = sum(score1 > score2) / n(),
empatou = sum(score1 == score2) / n(),
perdeu = sum(score1 < score2) / n()
)
p <- historicos %>%
filter(jogos > 2) %>%
ggplot(aes(x = ganhou,
y = jogos,
text = paste("Seleção:", time2,
"\nVitorio:",ganhou,
"\nPerdeu:",perdeu,
"\nEmpatou:",empatou))) +
geom_point(size = 4,
color = "#938BA1") +
labs(y = "Quantidade de Jogos",
x = "Proporção de Vitorias.")
ggplotly(p, tooltip = "text") %>%
layout(autosize = F)
We recommend that you use the dev version of ggplot2 with `ggplotly()`
Install it with: `devtools::install_github('hadley/ggplot2')`
- Podemos observar que o Brasil tende a ter um numero maior de vitorias.
- Os times que o Brasil mais jogou são times da america latina podemos associar isto aos campeonatos classificatorio para copa do mundo.
- Para está analise consideramos apenas seleções que o Brasil jogou mais do que 2 vezes.
agrupamento_h = historicos %>%
filter(jogos > 2) %>%
as.data.frame() %>%
column_to_rownames("time2") %>%
select(ganhou) %>%
dist(method = "euclidian") %>%
hclust(method = "ward.D")
ggdendrogram(agrupamento_h, rotate = T, size = 2, theme_dendro = F) +
labs(y = "Dissimilaridade", x = "", title = "Dendrograma")

- Podemos observar em termo de Dissimilaridade a divisão de grupos bem proximas.
Aplicando o K-means
Com o intuito de busca grupos onde se encaixa os adversarios da seleção Brasileira podemos observar utilizando o algoritimo k-means.
*O algoritmo do K-Means pode ser descrito da seguinte maneira: +1: Escolher k distintos valores para centros dos grupos (possivelmente, de forma aleatória) +2: Associar cada ponto ao centro mais próximo +3: Recalcular o centro de cada grupo +4: Repetir os passos 2-3 até nenhum elemento mudar de grupo.
historico_t = historicos %>%
filter(jogos > 2) %>%
mutate(jogos = log10(jogos))
atribuicoes = tibble(k = 1:6) %>%
group_by(k) %>%
do(kmeans(select(historico_t,ganhou, jogos),
centers = .$k,
nstart = 10) %>% augment(historico_t)) # alterne entre filmes e filmes_t no augment
Unequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vector
atribuicoes_long = atribuicoes %>%
gather(key = "variavel", value = "valor", -time2, -k, -.cluster, -jogos)
atribuicoes %>%
ggplot(aes(x = ganhou, y = jogos, label = time2, colour = .cluster)) +
geom_point() +
#geom_text() +
facet_wrap(~ k) + scale_y_log10()

# A silhoueta
dists = select(historico_t, ganhou, jogos) %>% dist()
km = kmeans(select(historico_t, ganhou, jogos),
centers = 4,
nstart = 10)
silhouette(km$cluster, dists) %>%
plot(col = RColorBrewer::brewer.pal(4, "Set2"))

- Podemos observar que a divisão em 5 grupo como a ideal.
- Considerando o uso do k-means podemos verificar qual o melhor valor de k de acordo com o gráfico abaixo.
set.seed(123)
explorando_k = tibble(k = 1:15) %>%
group_by(k) %>%
do(
kmeans(select(historico_t, -time2),
centers = .$k,
nstart = 20) %>% glance()
)
explorando_k %>%
ggplot(aes(x = k, y = betweenss / totss)) +
geom_line() +
geom_point()

d.scaled.km.long = km %>%
augment(historico_t) %>%
gather(key = "variável",
value = "valor",
-time2, -.cluster)
d.scaled.km.long %>%
ggplot(aes(x=`variável`, y=valor, group=time2, colour=.cluster)) +
geom_line(alpha = .5) +
facet_wrap(~ .cluster)

- Vamos identificar esses grupos:
- O grupo 1 podemos verificar que é constituido por times onde o Brasil mais jogou contra.
- O grupo 2 observamos que o Brasil jogou um número considerado de partidas e tem um número de vitorias bem contra esses times.
- O grupo 3 Consideramos que é o grupo onde tem seleções que o Brasil mais perdeu.
- O grupo 4 observamos que é os “fregueses” do Brasil pós é as seleções que mais perderam para o Brasil.
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShnZ2RlbmRybykKbGlicmFyeShoZXJlKQpsaWJyYXJ5KHBsb3RseSkKbGlicmFyeShjbHVzdGVyKQpsaWJyYXJ5KGdnZm9ydGlmeSkKbGlicmFyeShicm9vbSkKc291cmNlKGhlcmU6OmhlcmUoImNvZGUvcGxvdGFfc29sdWNvZXNfaGNsdXN0LlIiKSkKYGBgCgpgYGB7cn0KZ2FtZXMgPSByZWFkX2NzdigiLi4vZGF0YS9pbnRlcm5hdGlvbmFsLWZvb3RiYWxsLmNzdiIpCgpqb2dvc19jb3BhID0gZ2FtZXMgJT4lIAogICAgZmlsdGVyKGhvbWVfdGVhbSA9PSAiQnJhemlsIiB8IGF3YXlfdGVhbSA9PSAiQnJhemlsIikgJT4lIAogICAgbXV0YXRlKHRpbWUxID0gIkJyYXppbCIsIAogICAgICAgICAgIHRpbWUyID0gaWZfZWxzZShob21lX3RlYW0gPT0gIkJyYXppbCIsIGF3YXlfdGVhbSwgaG9tZV90ZWFtKSwgCiAgICAgICAgICAgc2NvcmUxID0gaWZfZWxzZShob21lX3RlYW0gPT0gIkJyYXppbCIsIGhvbWVfc2NvcmUsIGF3YXlfc2NvcmUpLAogICAgICAgICAgIHNjb3JlMiA9IGlmX2Vsc2UoaG9tZV90ZWFtID09ICJCcmF6aWwiLCBhd2F5X3Njb3JlLCBob21lX3Njb3JlKSAKICAgICkgCgpoaXN0b3JpY29zID0gam9nb3NfYnJhc2lsICU+JSAKICAgIGdyb3VwX2J5KHRpbWUyKSAlPiUgCiAgICBzdW1tYXJpc2UoCiAgICAgICAgam9nb3MgPSBuKCksCiAgICAgICAgZ2FuaG91ID0gc3VtKHNjb3JlMSA+IHNjb3JlMikgLyBuKCksIAogICAgICAgIGVtcGF0b3UgPSBzdW0oc2NvcmUxID09IHNjb3JlMikgLyBuKCksCiAgICAgICAgcGVyZGV1ID0gc3VtKHNjb3JlMSA8IHNjb3JlMikgLyBuKCkKICAgICAgICApCmBgYApgYGB7cn0KcCA8LSBoaXN0b3JpY29zICU+JQogIGZpbHRlcihqb2dvcyA+IDIpICU+JQogIGdncGxvdChhZXMoeCA9IGdhbmhvdSwgCiAgICAgICAgICAgICB5ID0gam9nb3MsIAogICAgICAgICAgICAgdGV4dCA9IHBhc3RlKCJTZWxlw6fDo286IiwgdGltZTIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgIlxuVml0b3JpbzoiLGdhbmhvdSwKICAgICAgICAgICAgICAgICAgICAgICAgICAiXG5QZXJkZXU6IixwZXJkZXUsCiAgICAgICAgICAgICAgICAgICAgICAgICAgIlxuRW1wYXRvdToiLGVtcGF0b3UpKSkgKyAKICBnZW9tX3BvaW50KHNpemUgPSA0LAogICAgICAgICAgICAgY29sb3IgPSAiIzkzOEJBMSIpICsKICBsYWJzKHkgPSAiUXVhbnRpZGFkZSBkZSBKb2dvcyIsIAogICAgICAgeCA9ICJQcm9wb3LDp8OjbyBkZSBWaXRvcmlhcy4iKQoKZ2dwbG90bHkocCwgdG9vbHRpcCA9ICJ0ZXh0IikgJT4lCiAgICBsYXlvdXQoYXV0b3NpemUgPSBGKQpgYGAKKiBQb2RlbW9zIG9ic2VydmFyIHF1ZSBvIEJyYXNpbCB0ZW5kZSBhIHRlciB1bSBudW1lcm8gbWFpb3IgZGUgdml0b3JpYXMuCiogT3MgdGltZXMgcXVlIG8gQnJhc2lsIG1haXMgam9nb3Ugc8OjbyB0aW1lcyBkYSBhbWVyaWNhIGxhdGluYSBwb2RlbW9zIGFzc29jaWFyIGlzdG8gYW9zIGNhbXBlb25hdG9zIGNsYXNzaWZpY2F0b3JpbyBwYXJhIGNvcGEgZG8gbXVuZG8uCiogUGFyYSBlc3TDoSBhbmFsaXNlIGNvbnNpZGVyYW1vcyBhcGVuYXMgc2VsZcOnw7VlcyBxdWUgbyBCcmFzaWwgam9nb3UgbWFpcyBkbyBxdWUgMiB2ZXplcy4KCmBgYHtyfQphZ3J1cGFtZW50b19oID0gaGlzdG9yaWNvcyAlPiUgCiAgICBmaWx0ZXIoam9nb3MgPiAyKSAlPiUKICAgIGFzLmRhdGEuZnJhbWUoKSAlPiUgCiAgICBjb2x1bW5fdG9fcm93bmFtZXMoInRpbWUyIikgJT4lIAogICAgc2VsZWN0KGdhbmhvdSkgJT4lCiAgICBkaXN0KG1ldGhvZCA9ICJldWNsaWRpYW4iKSAlPiUgCiAgICBoY2x1c3QobWV0aG9kID0gIndhcmQuRCIpCgpnZ2RlbmRyb2dyYW0oYWdydXBhbWVudG9faCwgcm90YXRlID0gVCwgc2l6ZSA9IDIsIHRoZW1lX2RlbmRybyA9IEYpICsgCiAgICBsYWJzKHkgPSAiRGlzc2ltaWxhcmlkYWRlIiwgeCA9ICIiLCB0aXRsZSA9ICJEZW5kcm9ncmFtYSIpCmBgYAoqIFBvZGVtb3Mgb2JzZXJ2YXIgZW0gdGVybW8gZGUgRGlzc2ltaWxhcmlkYWRlIGEgZGl2aXPDo28gZGUgZ3J1cG9zIGJlbSBwcm94aW1hcy4KCiMjIEFwbGljYW5kbyBvIEstbWVhbnMKQ29tIG8gaW50dWl0byBkZSBidXNjYSBncnVwb3Mgb25kZSBzZSBlbmNhaXhhIG9zIGFkdmVyc2FyaW9zIGRhIHNlbGXDp8OjbyBCcmFzaWxlaXJhIHBvZGVtb3Mgb2JzZXJ2YXIgdXRpbGl6YW5kbyBvIGFsZ29yaXRpbW8gay1tZWFucy4KCipPIGFsZ29yaXRtbyBkbyBLLU1lYW5zIHBvZGUgc2VyIGRlc2NyaXRvIGRhIHNlZ3VpbnRlIG1hbmVpcmE6CiAgKzE6IEVzY29saGVyIGsgZGlzdGludG9zIHZhbG9yZXMgcGFyYSBjZW50cm9zIGRvcyBncnVwb3MgKHBvc3NpdmVsbWVudGUsIGRlIGZvcm1hIGFsZWF0w7NyaWEpCiAgKzI6IEFzc29jaWFyIGNhZGEgcG9udG8gYW8gY2VudHJvIG1haXMgcHLDs3hpbW8KICArMzogUmVjYWxjdWxhciBvIGNlbnRybyBkZSBjYWRhIGdydXBvCiAgKzQ6IFJlcGV0aXIgb3MgcGFzc29zIDItMyBhdMOpIG5lbmh1bSBlbGVtZW50byBtdWRhciBkZSBncnVwby4KCgpgYGB7cn0KaGlzdG9yaWNvX3QgPSBoaXN0b3JpY29zICU+JSAKICAgIGZpbHRlcihqb2dvcyA+IDIpICU+JSAKICAgIG11dGF0ZShqb2dvcyA9IGxvZzEwKGpvZ29zKSkgCgphdHJpYnVpY29lcyA9IHRpYmJsZShrID0gMTo2KSAlPiUgCiAgICBncm91cF9ieShrKSAlPiUgCiAgICBkbyhrbWVhbnMoc2VsZWN0KGhpc3Rvcmljb190LGdhbmhvdSwgam9nb3MpLCAKICAgICAgICAgICAgICBjZW50ZXJzID0gLiRrLCAKICAgICAgICAgICAgICBuc3RhcnQgPSAxMCkgJT4lIGF1Z21lbnQoaGlzdG9yaWNvX3QpKSAjIGFsdGVybmUgZW50cmUgZmlsbWVzIGUgZmlsbWVzX3Qgbm8gYXVnbWVudCAgCgphdHJpYnVpY29lc19sb25nID0gYXRyaWJ1aWNvZXMgJT4lIAogICAgZ2F0aGVyKGtleSA9ICJ2YXJpYXZlbCIsIHZhbHVlID0gInZhbG9yIiwgLXRpbWUyLCAtaywgLS5jbHVzdGVyLCAtam9nb3MpIAoKYXRyaWJ1aWNvZXMgJT4lCiAgICBnZ3Bsb3QoYWVzKHggPSBnYW5ob3UsIHkgPSBqb2dvcywgbGFiZWwgPSB0aW1lMiwgY29sb3VyID0gLmNsdXN0ZXIpKSArIAogICAgZ2VvbV9wb2ludCgpICsgCiAgICAjZ2VvbV90ZXh0KCkgKyAKICAgIGZhY2V0X3dyYXAofiBrKSArIHNjYWxlX3lfbG9nMTAoKQoKIyBBIHNpbGhvdWV0YQpkaXN0cyA9IHNlbGVjdChoaXN0b3JpY29fdCwgZ2FuaG91LCBqb2dvcykgJT4lIGRpc3QoKQprbSA9IGttZWFucyhzZWxlY3QoaGlzdG9yaWNvX3QsIGdhbmhvdSwgam9nb3MpLCAKICAgICAgICAgICAgY2VudGVycyA9IDQsIAogICAgICAgICAgICBuc3RhcnQgPSAxMCkgCgpzaWxob3VldHRlKGttJGNsdXN0ZXIsIGRpc3RzKSAlPiUgCiAgICBwbG90KGNvbCA9IFJDb2xvckJyZXdlcjo6YnJld2VyLnBhbCg0LCAiU2V0MiIpKQpgYGAKCiogUG9kZW1vcyBvYnNlcnZhciBxdWUgYSBkaXZpc8OjbyBlbSA1IGdydXBvIGNvbW8gYSBpZGVhbC4KKiBDb25zaWRlcmFuZG8gbyB1c28gZG8gay1tZWFucyBwb2RlbW9zIHZlcmlmaWNhciBxdWFsIG8gbWVsaG9yIHZhbG9yIGRlIGsgZGUgYWNvcmRvIGNvbSBvIGdyw6FmaWNvIGFiYWl4by4KCmBgYHtyfQpzZXQuc2VlZCgxMjMpCmV4cGxvcmFuZG9fayA9IHRpYmJsZShrID0gMToxNSkgJT4lIAogICAgZ3JvdXBfYnkoaykgJT4lIAogICAgZG8oCiAgICAgICAga21lYW5zKHNlbGVjdChoaXN0b3JpY29fdCwgLXRpbWUyKSwgCiAgICAgICAgICAgICAgIGNlbnRlcnMgPSAuJGssIAogICAgICAgICAgICAgICBuc3RhcnQgPSAyMCkgJT4lIGdsYW5jZSgpCiAgICApCgpleHBsb3JhbmRvX2sgJT4lIAogICAgZ2dwbG90KGFlcyh4ID0gaywgeSA9IGJldHdlZW5zcyAvIHRvdHNzKSkgKyAKICAgIGdlb21fbGluZSgpICsgCiAgICBnZW9tX3BvaW50KCkKYGBgCgoqIFBvZGVtb3Mgb2JzZXJ2YXIgcXVlIGsgZXN0w6EgZW50cmUgb3MgdmFsb3JlcyA1IGUgNi4KKiBDb25zaWRlcmFtb3MgcXVlIMOhIHBhcnRpciBkbyB2YWxvciA2IG7Do28gb2NvcnJlIHVtYSBtdWRhbsOnYSBjb25zaWRlcmFkYSBuYSBsaW5oYSBkYSBjdXJ2YSBlbnTDo28gcG9kZW1vcyBjb25zaWRlcmFyICoqayA9IDUqKi4KCiogT3JnYW5pemFuZG8gb3MgZGFkb3Mgc2VndW5kbyBvcyBncnVwb3MgaWRlbnRpZmljYWRvczoKYGBge3J9CmQuc2NhbGVkLmttLmxvbmcgPSBrbSAlPiUgCiAgICBhdWdtZW50KGhpc3Rvcmljb190KSAlPiUKICAgIGdhdGhlcihrZXkgPSAidmFyacOhdmVsIiwgCiAgICAgICAgICAgdmFsdWUgPSAidmFsb3IiLCAKICAgICAgICAgICAtdGltZTIsIC0uY2x1c3RlcikKYGBgCgpgYGB7cn0KZC5zY2FsZWQua20ubG9uZyAlPiUgCiAgICBnZ3Bsb3QoYWVzKHg9YHZhcmnDoXZlbGAsIHk9dmFsb3IsIGdyb3VwPXRpbWUyLCBjb2xvdXI9LmNsdXN0ZXIpKSArIAogICAgZ2VvbV9saW5lKGFscGhhID0gLjUpICsgCiAgICBmYWNldF93cmFwKH4gLmNsdXN0ZXIpIApgYGAKCiogVmFtb3MgaWRlbnRpZmljYXIgZXNzZXMgZ3J1cG9zOgogICsgTyBncnVwbyAxIHBvZGVtb3MgdmVyaWZpY2FyIHF1ZSDDqSBjb25zdGl0dWlkbyBwb3IgdGltZXMgb25kZSBvIEJyYXNpbCBtYWlzIGpvZ291IGNvbnRyYS4KICArIE8gZ3J1cG8gMiBvYnNlcnZhbW9zIHF1ZSBvIEJyYXNpbCBqb2dvdSB1bSBuw7ptZXJvIGNvbnNpZGVyYWRvIGRlIHBhcnRpZGFzIGUgdGVtIHVtIG7Dum1lcm8gZGUgdml0b3JpYXMgYmVtIGNvbnRyYSBlc3NlcyB0aW1lcy4KICArIE8gZ3J1cG8gMyBDb25zaWRlcmFtb3MgcXVlIMOpIG8gZ3J1cG8gb25kZSB0ZW0gc2VsZcOnw7VlcyBxdWUgbyBCcmFzaWwgbWFpcyBwZXJkZXUuCiAgKyBPIGdydXBvIDQgb2JzZXJ2YW1vcyBxdWUgw6kgb3MgImZyZWd1ZXNlcyIgZG8gQnJhc2lsIHDDs3Mgw6kgYXMgc2VsZcOnw7VlcyBxdWUgbWFpcyBwZXJkZXJhbSBwYXJhIG8gQnJhc2lsLgo=